***************************************************************
* Title: rwanda_merge_jde.do
* Author: Todd Pugatch
* Last update: June 10 2024
* Description: data cleaning for Blimpo and Pugatch, "Entrepreneurship Education
*	and Teacher Training in Rwanda," Stage 2 Registered report, Journal of 
*	Development Economics
*
* Inputs: 	headteacher_baseline_clean_jde.dta
*			headteacher_endline_clean_jde.dta
*			teacher_baseline_clean_jde.dta
*			teacher_endline_clean_jde.dta
*			teacherobs_endline_jde.dta
*			student_baseline_clean_jde.dta
*			student_endline_clean_jde.dta
*
* Outputs: 	headteacher_merge_jde.dta
*			teacher_merge_jde.dta
*			student_merge_jde.dta
*
* Notes: produced in preparation for JDE Registered Report Stage 2
*	--merges baseline (2016) and endline (2018) data only
*	--includes following baseline and endline data (saved as separate panels):
*		--head teacher survey
*		--teacher survey
*			--includes teacher classroom observation (endline only)
*		--student survey
*	--run these do-files to clean & modify data prior to conducting this merge:
*		rwanda_headteacher_baseline_jde.do
*		rwanda_headteacher_endline_jde.do
*		rwanda_teacher_baseline_jde.do
*		rwanda_teacher_endline_jde.do
*		rwanda_teacherobs_endline_jde.do
*		rwanda_student_baseline_jde.do
*		rwanda_student_endline_jde.do
****************************************************************

* Set environment
* Record the clock time at start; it is displayed again at the end of the do-file.
local start=`"$S_TIME"'
* Reset the session: data, matrices, Mata objects, graphs, and user-defined programs.
clear
clear matrix
clear mata
graph drop _all
program drop _all
* Close any log left open by a previous run (capture ignores the error if none is open).
cap log close
set more off

* Set directories 
* Uncomment and edit the next line, or define the global macro main before running this do-file.
*global main "[SET MAIN DIRECTORY HERE]"
* Fail fast when the project root is undefined: otherwise every path below would
* start at the filesystem root and the first -use- would stop with a file-not-found error.
if `"$main"'=="" {
	display as error "global macro main is not set: define the project root directory before running this do-file"
	exit 198
}
	global rawdata "$main/01_data/01_raw"
	global cleandata "$main/01_data/02_clean"
	global dofiles "$main/02_dofiles"
	global results "$main/03_results"
	global temp "$main/04_output"

**********************************************
* 			HEAD TEACHER SURVEYS		     *
**********************************************
* Build the head teacher panel: the baseline survey is the master file and the
* endline survey is matched to it one-to-one on school_code.
quietly use "$cleandata/headteacher_baseline_clean_jde.dta", clear
merge 1:1 school_code using "$cleandata/headteacher_endline_clean_jde.dta"

* Check the match: list every school with its merge status, then verify that
* matched schools carry the same name in both rounds.
/*3 schools did not complete HT endline survey*/
sort _merge school_code
list school_code schoolname_bl schoolname_el _merge, nolabel
assert schoolname_bl==schoolname_el if _merge==3

* Schools absent from a round have a missing in-sample flag for that round
* after the merge; recode those flags to 0. Then shrink storage types.
quietly {
	replace insample_bl=0 if insample_bl==.
	replace insample_el=0 if insample_el==.
	compress
}

* Label and write the merged panel.
label data "Head teacher survey, merged baseline (2016) & endline (2018)"
quietly save "$cleandata/headteacher_merge_jde.dta", replace

**********************************************
* 				TEACHER SURVEYS				 *
**********************************************
* first, merge baseline & endline teacher surveys
/*Note that baseline teacher survey had no teacher ID variable. PIs added teacherid based on 
	endline_teacherid_creation_protocol.docx.
	Baseline teacherid based on fact that only one teacher per school surveyed at baseline. In endline,
	surveyed teachers who were also surveyed at baseline were given ID with final digit "1." Specifically,
	they were assigned teacherid = schoolid*10 + 1.*/
* load baseline teacher survey
qui use "$cleandata/teacher_baseline_clean_jde.dta", clear

* merge by school_code & teacherid with endline teacher survey
* NOTE(review): -force- lets the merge proceed when a same-named variable is string in one
*	file and numeric in the other. Confirm no variable of interest is affected.
* NOTE(review): the merge is declared 1:m, but the 1:1 merge on the same key further below
*	requires school_code & teacherid to be unique here, so duplicates in the endline
*	file would stop the do-file at that point.
merge 1:m school_code teacherid using "$cleandata/teacher_endline_clean_jde.dta", force
ren _merge merge_surveys
* Variable labels are limited to 80 characters, so the text is kept short. It also names
* the actual merge key (school_code & teacherid).
lab var merge_surveys "merge result: baseline & endline teacher surveys (by school_code & teacherid)"
lab def merge_surveys 	1 "teacher in baseline survey only" ///
						2 "teacher in endline survey only"	///
						3 "teacher in both baseline & endline surveys"
lab val merge_surveys merge_surveys

/*********************************************
investigate merge quality:

	1. how many teachers claim to be in baseline but have no baseline survey match?
		--inbaseline_el=1, merge_surveys!=3
	2. how many teachers claim not to be in baseline survey but have a baseline survey match?
		--inbaseline_el=0, merge_surveys=3
**********************************************/
tab merge_surveys inbaseline_el, mi

* next, merge teacher endline survey with teacher endline observation, by teacherid & school_code
merge 1:1 school_code teacherid using "$cleandata/teacherobs_endline_jde.dta" 

ren _merge merge_endline
* Label shortened to fit the 80-character limit for variable labels.
lab var merge_endline "merge result: endline teacher survey & observation (by school_code & teacherid)"
* NOTE(review): the master data at this point also hold any teachers found only in the
*	baseline survey (merge_surveys==1), so value 1 is not limited to endline respondents.
lab def merge_endline 	1 "in endline survey only" 			///
						2 "in endline observation only" 	///
						3 "in both endline survey & observation"
lab val merge_endline merge_endline

* update "in sample" variables at endline
* (flags are missing for rows that came only from another file; recode those to 0)
qui replace insample_bl=0 if insample_bl==.
qui replace insample_el=0 if insample_el==.
qui replace insample_obs_el=0 if insample_obs_el==.

* update alternative definitions of treatment for those not matched between merges
* Missing values are filled with the within-school mean of the non-missing values.
* NOTE(review): this yields a 0/1 value only if treatment is constant within school_code.
foreach x in unassgn educateassgnt {
	qui bysort school_code: egen treatment_`x'x=mean(treatment_`x')
	qui replace treatment_`x'=treatment_`x'x if treatment_`x'==.
	drop treatment_`x'x
}

* drop variables to preserve privacy
*label drop enumename

* save data
qui compress
* Data labels are limited to 80 characters, so the description is abbreviated.
lab data "Teacher survey, baseline (2016) & endline (2018), with endline classroom obs."
qui save "$cleandata/teacher_merge_jde.dta", replace

**********************************************
* 				STUDENT SURVEYS				 *
**********************************************
* merge baseline & endline student surveys by student ID (created from "uniqueid" in both datasets)
/*can't merge on school_code also because students drop out or transfer schools between surveys*/
* load baseline student survey
qui use "$cleandata/student_baseline_clean_jde.dta", clear

* merge with endline student survey
* NOTE(review): -force- lets the merge proceed when a same-named variable is string in one
*	file and numeric in the other. Confirm no variable of interest is affected.
merge 1:1 studentid using "$cleandata/student_endline_clean_jde.dta", force

* does merge status match "baseline_student" variable from endline?
tab _merge baseline_student, mi

/*false negatives: students said to be in baseline, but without baseline data*/
sort school_code studentid
list studentid school_code schoolid school_name school school_code_106 if baseline_student==1 & _merge==2

/*false positives: students with matched baseline & endline studentid. but baseline_student=0*/
* (data are still sorted by school_code studentid from the listing above)
list studentid school_code schoolid school_name school if baseline_student==0 & _merge==3

* update "in sample" variables based on merge
* (a flag is missing for students absent from that round; recode those to 0)
foreach x in bl el {
	qui replace insample_`x'=0 if insample_`x'==.
}
ren _merge merge_blel
lab var merge_blel "result of student baseline/endline merge"

* drop students added at endline
qui drop if insample_bl==0

/*define transfer students: 
	--in baseline and endline surveys
	--baseline and endline school codes don't match
	--not a dropout*/
* NOTE(review): a missing school_code_el compares as unequal to school_code, so such a
*	student would be flagged as a transfer. Confirm school_code_el is never missing
*	when insample_el==1.
qui gen transfer_el=(insample_el==1 & insample_bl==1 & school_code_el!=school_code & dropout==0)
* transfer status is undefined for students not observed at endline
qui replace transfer_el=. if insample_el==0
lab var transfer_el "student transferred schools between baseline and endline"


qui compress
* Data labels are limited to 80 characters, so the description is abbreviated.
lab data "Student survey, merged baseline (2016) & endline (2018), baseline students only"
qui save "$cleandata/student_merge_jde.dta", replace

* Report run time: capture the clock time at the end, then print start and end times.
local end = "$S_TIME"
display "`start'"
display "`end'"
